In [ ]:
# Import the packages that will be useful for this part of the lesson
from collections import OrderedDict, Counter
import pandas as pd
from pprint import pprint
# Small trick to get a larger notebook display.
# NOTE: display/HTML now live in IPython.display; importing them from
# IPython.core.display is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
Think about the most efficient way to parse the file to get the information you want
... Now you can parse the file
Exercise 1: Randomly selected global event with catastrophic consequences
In [ ]:
from random import choice

# Pool of candidate catastrophic events; one is drawn at random.
disaster_list = [
    "Global_pandemic",
    "Brexit",
    "US_presidential_elections",
    "Nuclear_war",
    "Asteroide_storm",
    "End of_antibiotics_era",
]
choice(disaster_list)
The file indicated below contains a representative sample of the popular vote for the last US presidential election. Parse the file and return a count of the number of occurrences of each name
In [ ]:
file = "../data/US_election_vote_sample.txt"  # one candidate name per line (see solution below)
In [ ]:
In [ ]:
Exercise 2: Parsing a gene annotation file (gff3)
Parse the following gff3 gene annotation file. Extract the type of each line and print the counts ordered by descending occurrence
gff3 is a standard genomic format. Read the documentation here.
In [ ]:
file = "../data/gencode_sample.gff3"  # GENCODE annotation sample in GFF3 format
In [ ]:
! head -2 ../data/gencode_sample.gff3 # check format of GFF file
In [ ]:
# rearrange the lines below to create a working solution
type_field = line.split('\t')[2]
type_counter = Counter()
print('%s:\t%d' % count_info)
with open(file, 'r') as fh:
for line in fh.readlines():
for count_info in type_counter.most_common():
from collections import Counter
type_counter[type_field] += 1
Exercise 3:
Parse the following gff3 gene annotation file. For each seqid (chromosome), list the associated sequence IDs. Use an ordered dict to preserve the original chromosome ordering from the file
Example:
d={"chr1": ["stop_codon:ENST00000370551.8", "UTR5:ENST00000235933.10", ...], "chr2": ["ID=CDS:ENST00000504764.5", "ID=UTR5:ENST00000480736.1", ...], ... }
In [ ]:
file = "../data/gencode_sample.gff3"
! head -2 ../data/gencode_sample.gff3
In [ ]:
# fill in the blanks (---) in the code below to create a working solution
from collections import ---
sequences = OrderedDict()
with open(file, ---) as fh:
--- line in fh.readlines():
fields = line.split()
seqid = fields[---]
attributes = ---
ID = attributes.split(';')[0]
if --- in sequences:
sequences[seqid].append(ID)
else:
sequences[seqid] = [ID]
for seq, ids in sequences.items():
print('%s:\t%s' % (seq, ', '.join(ids)))
Exercise 4:
1) Write a generator that can output DNA bases with the following frequencies: A/T = 0.19 and C/G = 0.31
In [ ]:
# Import the uniform method (which is also a generator by the way) to generate random floats
from random import uniform
def DNA_generator (): # Eventually you can have options for relative nucleotide frequencies
# Calculate cummulative frequencies to avoid having to do it each time in the loop
---
---
---
---
# Iterate indefinitly... until a nuclear apocalypse at least.
while True:
# Generate a random float frequency between 0 and max freq
freq = uniform (0, ---)
# Depending on the random frequency return the approriate base
if --- :
yield "A"
elif ---:
yield "T"
elif ---:
yield "C"
else:
yield "G"
2) Using this generator to generate a 100nt sequence (as a string)
In [ ]:
d = DNA_generator()
---
Exercise 5:
Parse the following SAM file. sam is a standard genomic format. Read the documentation here. It can be read as a table with pandas.
In [ ]:
file = "../data/sample_alignment.sam"
In [ ]:
import pandas as pd
# SAM header lines start with '@', so they are skipped as comments;
# the alignment section carries no column names, hence header=None.
df = pd.read_table(file, comment='@', header=None)
Which of the following code blocks will:
1) Print the 10 last rows?
# a)
df.head(10)
# b)
df.tail(10)
# c)
df.last10()
#d)
df[6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
In [ ]:
2) Sample randomly 10 rows and compute the mean and median fragment length (TLEN)?
# a)
sample = df.sample(10)
meanTLEN = sample[1].mean()
medianTLEN = sample[1].median()
# b)
sample = df.sample(10)
meanTLEN = sample[8].mean
medianTLEN = sample[8].median
# c)
sample = df.sample(10)
meanTLEN = sample[8].mean()
medianTLEN = sample[8].median()
In [ ]:
3) Generate a summary for all the columns?
# a)
pd.describe(df)
# b)
df.describe(include='all')
# c)
summarise(pd.df)
# d)
df.summarise('columns')
In [ ]:
Exercise 6:
Parse the following count file obtained by Kallisto from an RNAseq Dataset. The file is not a standard genomics format, but it is a tabulated file and some information can be found in Kallisto documentation.
In [ ]:
file = "../data/abundance.tsv"
In [ ]:
Exercise 7:
1) Create a dataframe associating the abundance data from the abundance_file and the gene symbols from gene_to_transcript_file.
In [ ]:
# Fixed: the original line had a stray closing parenthesis (syntax error)
abundance_file = "../data/abundance.tsv"
In [ ]:
gsym_to_tid_file = "../data/gene_symbol_to_transcript_id.tsv"
In [ ]:
df1 =
df1.head()
In [ ]:
df2 =
df2.head()
In [ ]:
df3 =
In [ ]:
Exercise 8:
Create a random codon generator based on the known codon usage bias in the human genome.
The following file contains a list of all the human codons, their amino acid translation, their count in the human genome and their frequency/1000: "../data/codon_usage_bias_human.tsv"
To do so, you can parse the file in a pandas DataFrame. Pandas has a very useful method to sample a line from the Dataframe. Use the weights option to sample according to a probability weighting.
In [ ]:
file = "../data/codon_usage_bias_human.tsv"
In [ ]:
In [ ]:
Exercise 1
In [ ]:
c = Counter()
# Open the file of votes, one candidate name per line
with open("../data/US_election_vote_sample.txt", "r") as fp:
    for candidate in fp:
        # Strip the trailing newline (and stray whitespace): otherwise the
        # last line, which may lack a newline, is counted as a distinct key.
        c[candidate.strip()] += 1
# Order by most frequent element
c.most_common()
Exercise 2
In [ ]:
file = "../data/gencode_sample.gff3"
c = Counter()
# Open the file
with open(file, "r") as fp:
    # Iterate over lines
    for line in fp:
        # GFF3 header/directive lines start with '#' and have no type field
        if line.startswith("#"):
            continue
        # Split the line and get the 3rd field (the feature type)
        feature_type = line.split("\t")[2]
        # Increment the counter
        c[feature_type] += 1
# Order by most frequent element
c.most_common()
Exercise 3
In [ ]:
!head -n 1 "../data/gencode_sample.gff3"
In [ ]:
file = "../data/gencode_sample.gff3"
d = OrderedDict()
# Open the file
with open(file, "r") as fp:
    # Iterate over lines
    for line in fp:
        # GFF3 header/directive lines start with '#': skip them
        if line.startswith("#"):
            continue
        fields = line.split("\t")
        # First field is the seqid (chromosome)
        seqid = fields[0]
        # 9th field holds the ';'-separated attributes; the first one is
        # "ID=...", and [3:] strips the leading "ID=" tag
        ID = fields[8].split(";")[0][3:]
        # Append to the per-chromosome list, creating it on first sight
        d.setdefault(seqid, []).append(ID)
d
Exercise 4
1)
In [ ]:
from random import uniform
def DNA_generator (A_freq, T_freq, C_freq, G_freq): # Customizable frequency argument
# Calculate cummulative frequencies to avoid having to do it each time in the loop
cum_A_freq = A_freq
cum_T_freq = A_freq+T_freq
cum_C_freq = A_freq+T_freq+C_freq
cum_G_freq = A_freq+T_freq+C_freq+G_freq
# Iterate indefinitly
while True:
# Generate a random float frequency between 0 and max freq
freq = uniform (0, cum_G_freq)
# Depending on the random frequency return the approriate base
if freq <= cum_A_freq:
yield "A"
elif freq <= cum_T_freq:
yield "T"
elif freq <= cum_C_freq:
yield "C"
else:
yield "G"
In [ ]:
# achieve the same using random.choices
# if using python v<3.6, import choices from numpy.random
from random import choices
def DNA_generator_choices(weights=[0.19, 0.31, 0.31, 0.19], letters=['A', 'C', 'G', 'T']):
while True:
yield choices(population=letters, weights=weights, k=1)[0]
In [ ]:
# Create the generator with the required frequencies
d = DNA_generator(A_freq=0.19, T_freq=0.19, C_freq=0.31, G_freq=0.31)
# Test the generator: print 8 bases drawn from it
print(*(next(d) for _ in range(8)))
In [ ]:
# Create the generator with the required frequencies
d = DNA_generator_choices()
# Test the generator: print 8 bases drawn from it
print(*(next(d) for _ in range(8)))
2)
In [ ]:
# Iterative str construction with a loop.
# NOTE: repeated += on a str can be quadratic; kept here deliberately to
# contrast with the join() one-liner shown next.
seq=""
for _ in range (100):
    seq += next(d)
seq
In [ ]:
# Same with a one-liner list comprehension;
# str.join is the idiomatic (linear-time) way to build a string
seq = "".join([next(d) for _ in range (100)])
seq
Exercise 5
In [ ]:
file = "../data/sample_alignment.sam"
# The 11 mandatory SAM columns, used to name the table columns
columns_names = ['QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ', 'CIGAR', 'RNEXT', 'PNEXT', 'TLEN', 'SEQ', 'QUAL']
# NOTE(review): skiprows=[0,1] assumes this file has exactly two '@' header
# lines -- confirm against the file, or alignments could be silently dropped
df = pd.read_table(file, sep="\t", names = columns_names, skiprows=[0,1], index_col=0)
In [ ]:
df.tail(10)  # last 10 rows of the alignment table
In [ ]:
# Sample 10 random alignments and report their fragment-length (TLEN) statistics
tlen_sample = df.sample(10).TLEN
print (tlen_sample)
print ("\nMean:", tlen_sample.mean())
print ("\nMedian:", tlen_sample.median())
In [ ]:
df.describe(include="all")  # summary of every column, numeric and non-numeric alike
Exercise 6
In [ ]:
file = "../data/abundance.tsv"
# Kallisto abundance table, indexed by the first column (the target/transcript id)
df = pd.read_table(file, index_col=0)
In [ ]:
# Select a specific set of transcripts by index label
df.loc[['ENST00000487368.4', 'ENST00000623229.1', 'ENST00000444276.1', 'ENST00000612487.4', 'ENST00000556673.2', 'ENST00000623191.1']]
In [ ]:
# First 10 rows of the count and TPM columns only
df[["est_counts", "tpm"]].head(10)
In [ ]:
# Boolean-mask filter: keep only very highly expressed transcripts
df[(df.tpm > 10000)]
In [ ]:
# Keep well-expressed transcripts, then rank them by effective length
df = df[(df.est_counts > 1000) & (df.tpm > 1000)]
df = df.sort_values("eff_length")
df.head(10)
Exercise 7
In [ ]:
# Load the Kallisto abundances and the transcript-id -> gene-symbol mapping
df1 = pd.read_table("../data/abundance.tsv")
df2 = pd.read_table("../data/gene_symbol_to_transcript_id.tsv", names=["transcript_id", "gene_symbol"])
# Inner join on the transcript identifier shared by both tables
df3 = pd.merge(left=df1, right=df2, left_on="target_id", right_on="transcript_id", how="inner")
df3 = df3.sort_values("transcript_id")
df3 = df3.reset_index(drop=True)
# drop() returns a new DataFrame: the result must be assigned back, otherwise
# the redundant target_id column is silently kept (bug in the original)
df3 = df3.drop(["target_id"], axis=1)
df3.head()
Exercise 8
In [ ]:
# Hex-escaped for fun: decodes to and prints "Good Luck"
print ("\x47\x6f\x6f\x64 \x4c\x75\x63\x6b")